# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from scrapy import Request
from scrapydaxue.items import daxueItem

class DaxueSpider(scrapy.Spider):
    """Crawl daxue.eol.cn's accredited-university list.

    Flow: the landing page (``start_urls``) links one list page per
    province; each list page holds one or more ``table.table-x`` tables
    whose rows — after two header rows — each describe one university.
    """
    name = 'daxue'
    #allowed_domains = ['daxue.eol.cn']
    start_urls = ['https://daxue.eol.cn/mingdan.shtml']

    def parse(self, response):
        """Yield one Request per province link on the landing page."""
        soup = BeautifulSoup(response.text, 'lxml')
        for province_div in soup.find_all(name='div', class_='province'):
            link = province_div.a
            # Guard: a province block without an <a href=...> previously
            # raised AttributeError/KeyError and killed the callback.
            if link is None or not link.get('href'):
                continue
            # urljoin resolves relative hrefs; absolute URLs pass through.
            yield Request(url=response.urljoin(link['href']),
                          callback=self.parse_daxue_list)

    def parse_daxue_list(self, response):
        """Yield one daxueItem per data row of every university table."""
        soup = BeautifulSoup(response.text, 'lxml')

        # Page title looks like "<province>正规高校名单"; strip the suffix
        # to keep only the province name.
        prov = soup.find_all(name='div', class_='title')[0].text.replace('正规高校名单', '')

        for table in soup.find_all(name='table', class_='table-x'):
            # Slice off the two header rows ONCE. The old code re-ran
            # find_all('tr') on every iteration (O(n^2)) and compared Tags
            # for equality — BeautifulSoup Tag equality compares content,
            # so a data row that happened to duplicate a header row's
            # markup would have been silently skipped.
            for row in table.find_all(name='tr')[2:]:
                cells = row.find_all(name='td')  # hoisted: one scan per row
                if len(cells) < 6:
                    # Malformed/short row: skip instead of IndexError-ing
                    # out of the whole page.
                    continue
                daxue = daxueItem()
                daxue['province'] = prov
                daxue['name'] = cells[1].text
                daxue['bianhao'] = cells[2].text
                daxue['zhishujigou'] = cells[3].text
                daxue['diqu'] = cells[4].text
                daxue['jibei'] = cells[5].text
                yield daxue